
# Minimum match_probability a candidate pair needs to be kept as an edge.
cluster_thresh = 0.5

# Build an undirected graph from the match pairs stored under
# data/fec_across_matches. Each kept pair (year_cluster_l, year_cluster_r)
# becomes one edge. Two match sets are pooled:
#   'a' -- rows where gamma_contbr_addr is positive or exactly -1
#   'p' -- rows where gamma_po_num is positive or exactly -1
# NOTE(review): -1 presumably encodes a missing comparison value -- confirm
# against the matching step that produced these files.
this_graph = list(

  open_dataset('data/fec_across_matches/a') %>%
    filter(gamma_contbr_addr > 0 | gamma_contbr_addr == -1, match_probability >= cluster_thresh) %>%
    select(year_cluster_l, year_cluster_r) %>%
    collect() %>%
    as.data.table(),

  open_dataset('data/fec_across_matches/p') %>%
    filter(gamma_po_num > 0 | gamma_po_num == -1, match_probability >= cluster_thresh) %>%
    select(year_cluster_l, year_cluster_r) %>%
    collect() %>%
    as.data.table()

) %>%
  rbindlist() %>%
  as.matrix() %>%
  # Spell out FALSE: `F` is an ordinary variable that can be reassigned.
  graph_from_edgelist(directed = FALSE)
  
# Connected components of the match graph: every vertex in one component is
# given the same cluster label.
comps = components(this_graph)

# Lookup table from year_cluster (the vertex name) to its cluster label,
# which is the component number prefixed with 'e-'.
# NOTE(review): assumes the graph vertices carry names, i.e. the edge list
# was a character matrix -- confirm.
cluster_names = data.table(
  cluster = str_c('e-', comps$membership),
  year_cluster = names(comps$membership)
)
setkey(cluster_names, year_cluster)

# Per-cycle contributor snapshots, one list element per election cycle
# (named by the cycle year); filled by the loop below.
contributors = list()

# Descriptive columns carried into each snapshot alongside `cluster`.
contributor_cols = c(
  'gender',
  'contbr_nm_first',
  'contbr_name_fm',
  'contbr_nm_last',
  'contbr_name_ls',
  'po_num',
  'contbr_addr',
  'contbr_city',
  'contbr_zip',
  'contbr_st'
)

for (year in seq(2008, 2020, 2)) {
  print(glue('Working in {year}...'))

  # Load the whole cycle into memory as a data.table.
  all_year = as.data.table(collect(open_dataset(glue('data/fec_across/cycle_table={year}'))))
  setkey(all_year, year_cluster)

  # Attach the cluster label to every row; rows whose year_cluster never
  # appeared in the match graph come back with an NA cluster.
  all_year = cluster_names[all_year, on = 'year_cluster']

  # Unmatched rows keep their own year_cluster as the cluster id.
  all_year[is.na(cluster), cluster := year_cluster]

  # Sort within cluster by receipt date so the last row of each cluster is
  # the one with the latest contb_receipt_dt.
  setkey(all_year, cluster, contb_receipt_dt)

  # One row per cluster (its last row after sorting), restricted to the
  # descriptive columns and tagged with the cycle.
  snapshot = all_year[, .SD[.N], by = cluster, .SDcols = contributor_cols]
  snapshot[, cycle := year]
  contributors[[as.character(year)]] = snapshot

  # Drop the partition helper columns before writing the final table.
  all_year[, c('state_partition', 'across_partition') := NULL]

  write_dataset(
    all_year,
    glue('data/fec_final/cycle_table={year}'),
    partitioning = 'contbr_st',
    max_rows_per_file = 5e5
  )
}

# Stack the per-cycle snapshots into one table.
contributors = rbindlist(contributors)

# Order by cluster, then cycle, so the last row of each cluster is the one
# from its most recent cycle.
setkey(contributors, cluster, cycle)

# One row per cluster: the descriptive fields from its most recent cycle.
unique_contributor_info = contributors[, .SD[.N], by = cluster][, cycle := NULL]

setkey(unique_contributor_info, cluster)

# Wide table with one column per cycle, counting the rows each cluster has
# in that cycle (0 when the cluster is absent from the cycle).
years_gave = dcast(
  contributors[, .(cluster, cycle)],
  cluster ~ cycle,
  fun.aggregate = length,
  value.var = 'cycle'
)

# Prefix the cycle columns with 'gave_'. The names are derived from the
# columns dcast actually produced rather than from a hard-coded year range:
# the previous fixed list (2004-2020) did not match the cycles processed
# upstream, so the rename failed on a length mismatch.
cycle_cols = setdiff(names(years_gave), 'cluster')
setnames(years_gave, cycle_cols, str_c('gave_', cycle_cols))

# Attach the per-cycle indicators to the contributor attributes.
out = unique_contributor_info[years_gave, on = 'cluster']

write_dataset(
  out,
  'data/fec_contributors',
  partitioning = 'contbr_st',
  max_rows_per_file = 5e5
)

